import pandas as pd
import os
import json

"""
Check whether any data is missing.
"""

# Format used to render the expected bar times; the "datetime" column of each
# CSV is compared against strings in this format.
format_str = "%Y-%m-%d %H:%M:%S"

workspaces = ["ws"]
assets = ["spot", "future"]
timeframe = ["5m", "15m", "1h"]


def load_config(path=None):
    """Load the JSON config that maps workspace names to directories.

    Args:
        path: Config file to read. Defaults to ``~/.jupyter/config.json``.

    Returns:
        The parsed JSON document.
    """
    if path is None:
        path = os.path.join(os.path.expanduser("~"), ".jupyter", "config.json")
    # Use a context manager so the file handle is closed deterministically.
    with open(path, "r") as fh:
        return json.load(fh)


def to_pandas_freq(tf: str) -> str:
    """Translate a timeframe label such as ``"5m"`` into a pandas frequency.

    A trailing ``m`` (minutes) becomes ``min``; the ``T`` alias is deprecated
    in recent pandas releases. Other labels (e.g. ``"1h"``) pass through
    unchanged.
    """
    if tf.endswith("m"):
        return tf[:-1] + "min"
    return tf


def check_datetimes(data_list, tf: str):
    """Compare recorded bar times against the full expected grid.

    The expected grid runs from the first to the last entry of ``data_list``
    with a step of ``tf``, rendered with ``format_str``.

    Args:
        data_list: Non-empty list of datetime strings as read from the CSV.
        tf: Timeframe label, e.g. ``"5m"`` or ``"1h"``.

    Returns:
        A ``(ok, missing)`` tuple. ``ok`` is True only when ``data_list``
        equals the expected grid exactly. ``missing`` lists, in chronological
        order, the expected entries absent from ``data_list``; it can be empty
        while ``ok`` is False (duplicated, unordered or off-grid rows).
    """
    grid = pd.date_range(data_list[0], data_list[-1], freq=to_pandas_freq(tf))
    expected = grid.strftime(format_str).tolist()
    if expected == data_list:
        return True, []

    present = set(data_list)
    # Walk the ordered grid instead of subtracting sets so the result stays
    # chronological and its first/last elements bound the gap range.
    missing = [stamp for stamp in expected if stamp not in present]
    return False, missing


def main():
    """Walk every workspace/asset/timeframe directory and report gaps."""
    cfg = load_config()

    for ws in workspaces:
        data_path = os.path.join(cfg[ws], "data/binance/market")

        for asset in assets:
            asset_dir = os.path.join(data_path, asset)
            if not os.path.isdir(asset_dir):
                continue

            for tf in timeframe:
                csv_path = os.path.join(asset_dir, tf)
                if not os.path.isdir(csv_path):
                    continue

                files = sorted(
                    e[:-4] for e in os.listdir(csv_path) if e.endswith(".csv")
                )

                for idx, symbol in enumerate(files):
                    tag = f"{ws}-{asset}-{tf}-{symbol}-[{idx}/{len(files)}]"
                    file_path = os.path.join(csv_path, f"{symbol}.csv")
                    # The file may have been removed since the directory listing.
                    if not os.path.exists(file_path):
                        continue

                    try:
                        df = pd.read_csv(file_path)
                    except pd.errors.EmptyDataError:
                        # Zero-byte file: nothing to parse.
                        print(f"{tag} empty error")
                        continue

                    if df.empty:
                        # Header-only file: there is no first/last row to span.
                        print(f"{tag} empty error")
                        continue

                    if df["timestamp"].isna().any():
                        print(f"{tag} NA error")

                    ok, missing = check_datetimes(df["datetime"].tolist(), tf)
                    if ok:
                        print(f"{tag} has checked.")
                    elif missing:
                        print(
                            f"{tag} error: len: {len(missing)}, "
                            f"[{missing[0]}-{missing[-1]}]"
                        )
                    else:
                        # Lists differ although no expected bar is absent.
                        print(
                            f"{tag} error: len: 0, "
                            "rows are duplicated, unordered or off-grid"
                        )


if __name__ == "__main__":
    main()
